//===--- ARCCodeMotion.cpp - PIL ARC Code Motion --------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
///
/// \file
///
/// This pass moves retains down and releases up. This, hopefully, will help
/// ARC sequence opt to remove retain and release pairs without worrying too
/// much about control flows.
///
/// It uses an optimistic iterative data flow to compute where to insert the
/// retains and releases for every reference-counted root. It then removes all
/// the old retain and release instructions and create the new ones.
///
/// This pass is more sophisticated than PILCodeMotion, as arc optimizations
/// can be very beneficial, use an optimistic global data flow to achieve
/// optimality.
///
/// Proof of Correctness:
/// -------------------
///
/// 1. Retains are blocked by MayDecrements. Its straightforward to prove that
/// retain sinking is correct.
///
/// If a retain is sunk from Region A to Region B, that means there is no
/// blocking operation between where the retain was in Region A to where it is
/// sunk to in Region B. Since we only sink retains (we do not move any other
/// instructions) which themselves are NOT MayDecrement operations, and moving
/// retains can't turn non-decrement instruction MayDecrement.
///
/// 2. Releases are blocked by MayInterfere. If a release is hoisted from
/// Region B to Region A, that means there is no blocking operation from where
/// the release was in Region B and where the release is hoisted to in Region A.
///
/// The question is whether we can introduce such operation while we hoist
/// other releases. The answer is NO. because if such releases exist, they
/// would be blocked by the old release (we remove old release and recreate new
/// ones at the end of the pass) and will not be able to be hoisted beyond the
/// old release.
///
/// This proof also hinges on the fact that if release A interferes with
/// releases B then release B must interfere with release A. i.e. the 2
/// releases must have the symmetric property. Consider the 2 releases as 2
/// function calls, i.e. CallA (release A) and CallB (release B), if CallA
/// interferes with CallB, that means CallA must share some program states
/// (through read or write) with CallB. Then it is not possible for CallB
/// to not share any states with CallA. And if they do share states, then
/// its not possible for CallB to block CallA and CallA not to block CallB.
///
/// TODO: Sinking retains can block releases to be hoisted, and hoisting
/// releases can block retains to be sunk. Investigate when to sink retains and
/// when to hoist releases and their ordering in the pass pipeline.
///
/// TODO: Consider doing retain hoisting and release sinking. This can help
/// to discover disjoint lifetimes and we can try to stitch them together.
///
/// TODO: There are a lot of code duplications between retain and release code
/// motion in the data flow part. Consider whether we can share them.
/// Essentially, we can implement the release code motion by inverting the
/// retain code motion, but this can also make the code less readable.
///
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "pil-rr-code-motion"

#include "polarphp/pil/lang/InstructionUtils.h"
#include "polarphp/pil/lang/PILBuilder.h"
#include "polarphp/pil/optimizer/analysis/ARCAnalysis.h"
#include "polarphp/pil/optimizer/analysis/AliasAnalysis.h"
#include "polarphp/pil/optimizer/analysis/EscapeAnalysis.h"
#include "polarphp/pil/optimizer/analysis/PostOrderAnalysis.h"
#include "polarphp/pil/optimizer/analysis/ProgramTerminationAnalysis.h"
#include "polarphp/pil/optimizer/analysis/RCIdentityAnalysis.h"
#include "polarphp/pil/optimizer/passmgr/Transforms.h"
#include "polarphp/pil/optimizer/utils/CFGOptUtils.h"
#include "polarphp/pil/optimizer/utils/InstOptUtils.h"
#include "polarphp/global/NameStrings.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"

using namespace polar;

STATISTIC(NumRetainsSunk, "Number of retains sunk");
STATISTIC(NumReleasesHoisted, "Number of releases hoisted");

llvm::cl::opt<bool> DisableARCCodeMotion("disable-arc-cm", llvm::cl::init(false));

//===----------------------------------------------------------------------===//
//                             Block State
//===----------------------------------------------------------------------===//

struct BlockState {
   /// A bit vector for which the ith bit represents the ith refcounted root in
   /// RCRootVault.
   ///
   /// NOTE: we could do the data flow with BBSetIn or BBSetOut, but that would
   /// require us to create a temporary copy to check whether the BBSet has
   /// changed after the genset and killset has been applied.
   SmallBitVector BBSetIn;

   /// A bit vector for which the ith bit represents the ith refcounted root in
   /// RCRootVault.
   SmallBitVector BBSetOut;

   /// A bit vector for which the ith bit represents the ith refcounted root in
   /// RCRootVault. If the bit is set, that means this basic block creates a
   /// retain which can be sunk or a release which can be hoisted.
   SmallBitVector BBGenSet;

   /// A bit vector for which the ith bit represents the ith refcounted root in
   /// RCRootVault. If this bit is set, that means this basic block stops retain
   /// or release of the refcounted root to be moved across.
   SmallBitVector BBKillSet;

   /// A bit vector for which the ith bit represents the ith refcounted root in
   /// RCRootVault. If this bit is set, that means this is potentially a retain
   /// or release that can be sunk or hoisted to this point. This is used to
   /// optimize the time for computing genset and killset.
   ///
   /// NOTE: this vector contains an approximation of whether there will be a
   /// retain or release to a certain point of a basic block.
   SmallBitVector BBMaxSet;
};

/// CodeMotionContext - This is the base class which retain code motion and
/// release code motion inherits from. It defines an interface as to how the
/// code motion procedure should be.
class CodeMotionContext {
protected:
   /// Dataflow needs multiple iteration to converge. If this is false, then we
   /// do not need to generate the genset or killset, i.e. we can simply do 1
   /// pessimistic data flow iteration.
   bool MultiIteration;

   /// The allocator we are currently using.
   llvm::SpecificBumpPtrAllocator<BlockState> &BPA;

   /// Current function we are analyzing.
   PILFunction *F;

   /// Current post-order we are using.
   PostOrderFunctionInfo *PO;

   /// Current alias analysis we are using.
   AliasAnalysis *AA;

   /// Current rc-identity we are using.
   RCIdentityFunctionInfo *RCFI;

   /// All the unique refcount roots retained or released in the function.
   llvm::SetVector<PILValue> RCRootVault;

   /// Contains a map between RC roots to their index in the RCRootVault.
   /// used to facilitate fast RC roots to index lookup.
   llvm::DenseMap<PILValue, unsigned> RCRootIndex;

   /// All the retains or releases originally in the function. Eventually
   /// they will all be removed after all the new ones are generated.
   llvm::SmallPtrSet<PILInstruction *, 8> RCInstructions;

   /// All the places to place the new retains or releases after code motion.
   using InsertPointList = llvm::SmallVector<PILInstruction *, 2>;
   llvm::SmallDenseMap<PILValue, InsertPointList> InsertPoints;

   /// These are the blocks that have an RC instruction to process or it blocks
   /// some RC instructions. If the basic block has neither, we do not need to
   /// process the block again in the last iteration. We populate this set when
   /// we compute the genset and killset.
   llvm::SmallPtrSet<PILBasicBlock *, 8> InterestBlocks;

#ifndef NDEBUG
   // PILPrintContext is used to print block IDs in RPO order.
   // It is optional so only the final insertion point interference is printed.
   Optional<PILPrintContext> printCtx;
#endif

   /// Return the rc-identity root of the PILValue.
   PILValue getRCRoot(PILValue R) {
      return RCFI->getRCIdentityRoot(R);
   }

   /// Return the rc-identity root of the RC instruction, i.e.
   /// retain or release.
   PILValue getRCRoot(PILInstruction *I) {
      assert((isRetainInstruction(I) || isReleaseInstruction(I)) &&
                                       "Extracting RC root from invalid instruction");
      return getRCRoot(I->getOperand(0));
   }

public:
   /// Constructor.
   CodeMotionContext(llvm::SpecificBumpPtrAllocator<BlockState> &BPA,
                     PILFunction *F,
                     PostOrderFunctionInfo *PO, AliasAnalysis *AA,
                     RCIdentityFunctionInfo *RCFI)
      : MultiIteration(true), BPA(BPA), F(F), PO(PO), AA(AA), RCFI(RCFI) {}

   /// virtual destructor.
   virtual ~CodeMotionContext() {}

   /// Run the data flow to move retains and releases.
   bool run();

   /// Check whether we need to run an optimistic iteration data flow.
   /// or a pessimistic would suffice.
   virtual bool requireIteration() = 0;

   /// Initialize necessary things to run the iterative data flow.
   virtual void initializeCodeMotionDataFlow() = 0;

   /// Initialize the basic block maximum refcounted set.
   virtual void initializeCodeMotionBBMaxSet() = 0;

   /// Compute the genset and killset for every root in every basic block.
   virtual void computeCodeMotionGenKillSet() = 0;

   /// Run the iterative data flow to converge.
   virtual void convergeCodeMotionDataFlow() = 0;

   /// Use the data flow results, come up with places to insert the new inst.
   virtual void computeCodeMotionInsertPoints() = 0;

   /// Remove the old retains and create the new *moved* refcounted instructions
   virtual bool performCodeMotion() = 0;

   /// Merge the data flow states.
   virtual void mergeBBDataFlowStates(PILBasicBlock *BB) = 0;

   /// Compute the BBSetIn and BBSetOut for the current basic
   /// block with the generated gen and kill set.
   virtual bool processBBWithGenKillSet(PILBasicBlock *BB) = 0;

   /// Return true if the instruction blocks the Ptr to be moved further.
   virtual bool mayBlockCodeMotion(PILInstruction *II, PILValue Ptr) = 0;
};

bool CodeMotionContext::run() {
   MultiIteration = requireIteration();

   // Initialize the data flow.
   initializeCodeMotionDataFlow();

   // Converge the BBSetOut with iterative data flow.
   if (MultiIteration) {
      initializeCodeMotionBBMaxSet();
      computeCodeMotionGenKillSet();
      convergeCodeMotionDataFlow();
   }

   // Compute the insertion point where each RC root can be moved to.
   computeCodeMotionInsertPoints();

   // Finally, generate new retains and remove the old retains.
   return performCodeMotion();
}

//===----------------------------------------------------------------------===//
//                          Retain Code Motion
//===----------------------------------------------------------------------===//

class RetainBlockState : public BlockState {
public:
   /// Check whether the BBSetOut has changed. If it does, we need to rerun
   /// the data flow on this block's successors to reach fixed point.
   bool updateBBSetOut(SmallBitVector &X) {
      if (BBSetOut == X)
         return false;
      BBSetOut = X;
      return true;
   }

   /// constructor.
   RetainBlockState(bool IsEntry, unsigned size, bool MultiIteration) {
      // Iterative forward data flow.
      BBSetIn.resize(size, false);
      // Initialize to true if we are running optimistic data flow, i.e.
      // MultiIteration is true.
      BBSetOut.resize(size, MultiIteration);
      BBMaxSet.resize(size, !IsEntry && MultiIteration);

      // Genset and Killset are initially empty.
      BBGenSet.resize(size, false);
      BBKillSet.resize(size, false);
   }
};

/// RetainCodeMotionContext - Context to perform retain code motion.
class RetainCodeMotionContext : public CodeMotionContext {
   /// All the retain block state for all the basic blocks in the function.
   llvm::SmallDenseMap<PILBasicBlock *, RetainBlockState *> BlockStates;

   ProgramTerminationFunctionInfo PTFI;

   /// Return true if the instruction blocks the Ptr to be moved further.
   bool mayBlockCodeMotion(PILInstruction *II, PILValue Ptr) override {
      // NOTE: If more checks are to be added, place the most expensive in the
      // end, this function is called many times.
      //
      // These terminator instructions block.
      if (isa<ReturnInst>(II) || isa<ThrowInst>(II) || isa<UnwindInst>(II) ||
          isa<UnreachableInst>(II))
         return true;
      // Identical RC root blocks code motion, we will be able to move this retain
      // further once we move the blocking retain.
      if (isRetainInstruction(II) && getRCRoot(II) == Ptr) {
         LLVM_DEBUG(if (printCtx) llvm::dbgs()
               << "Retain " << Ptr << "  at matching retain " << *II);
         return true;
      }
      // Ref count checks do not have side effects, but are barriers for retains.
      if (mayCheckRefCount(II)) {
         LLVM_DEBUG(if (printCtx) llvm::dbgs()
               << "Retain " << Ptr << "  at refcount check " << *II);
         return true;
      }
      // mayDecrement reference count stops code motion.
      if (mayDecrementRefCount(II, Ptr, AA)) {
         LLVM_DEBUG(if (printCtx) llvm::dbgs()
               << "Retain " << Ptr << "  at may decrement " << *II);
         return true;
      }
      // This instruction does not block the retain code motion.
      return false;
   }

   /// Return the previous instruction if it happens to be a retain with the
   /// given RC root, nullptr otherwise.
   PILInstruction *getPrevReusableInst(PILInstruction *I, PILValue Root) {
      if (&*I->getParent()->begin() == I)
         return nullptr;
      auto Prev = &*std::prev(PILBasicBlock::iterator(I));
      if (isRetainInstruction(Prev) && getRCRoot(Prev) == Root)
         return Prev;
      return nullptr;
   }

public:
   /// Constructor.
   RetainCodeMotionContext(llvm::SpecificBumpPtrAllocator<BlockState> &BPA,
                           PILFunction *F, PostOrderFunctionInfo *PO,
                           AliasAnalysis *AA, RCIdentityFunctionInfo *RCFI)
      : CodeMotionContext(BPA, F, PO, AA, RCFI), PTFI(F) {}

   /// virtual destructor.
   ~RetainCodeMotionContext() override {}

   /// Return true if we do not need optimistic data flow.
   bool requireIteration() override;

   /// Initialize necessary things to run the iterative data flow.
   void initializeCodeMotionDataFlow() override;

   /// Initialize the basic block maximum refcounted set.
   void initializeCodeMotionBBMaxSet() override;

   /// Compute the genset and killset for every root in every basic block.
   void computeCodeMotionGenKillSet() override;

   /// Run the iterative data flow to converge.
   void convergeCodeMotionDataFlow() override;

   /// Use the data flow results, come up with places to insert the new inst.
   void computeCodeMotionInsertPoints() override;

   /// Remove the old retains and create the new *moved* refcounted instructions
   bool performCodeMotion() override;

   /// Compute the BBSetIn and BBSetOut for the current basic block with the
   /// generated gen and kill set.
   bool processBBWithGenKillSet(PILBasicBlock *BB) override;

   /// Merge the data flow states.
   void mergeBBDataFlowStates(PILBasicBlock *BB) override;
};

bool RetainCodeMotionContext::requireIteration() {
   // If all basic blocks will have their predecessors processed if the basic
   // blocks in the functions are iterated in reverse post order. Then this
   // function can be processed in one iteration, i.e. no need to generate the
   // genset and killset.
   llvm::SmallPtrSet<PILBasicBlock *, 4> PBBs;
   for (PILBasicBlock *B : PO->getReversePostOrder()) {
      for (auto X : B->getPredecessorBlocks()) {
         if (!PBBs.count(X))
            return true;
      }
      PBBs.insert(B);
   }
   return false;
}

void RetainCodeMotionContext::initializeCodeMotionDataFlow() {
   // Find all the RC roots in the function.
   for (auto &BB : *F) {
      for (auto &II : BB) {
         if (!isRetainInstruction(&II))
            continue;
         RCInstructions.insert(&II);
         PILValue Root = getRCRoot(&II);
         if (RCRootIndex.find(Root) != RCRootIndex.end())
            continue;
         RCRootIndex[Root] = RCRootVault.size();
         RCRootVault.insert(Root);
         LLVM_DEBUG(llvm::dbgs()
                       << "Retain Root #" << RCRootVault.size() << " " << Root);
      }
   }

   // Initialize all the data flow bit vector for all basic blocks.
   for (auto &BB : *F) {
      BlockStates[&BB] = new (BPA.Allocate())
         RetainBlockState(&BB == &*F->begin(),
                          RCRootVault.size(), MultiIteration);
   }
}

void RetainCodeMotionContext::initializeCodeMotionBBMaxSet() {
   for (PILBasicBlock *BB : PO->getReversePostOrder()) {
      // If basic block has no predecessor, do nothing.
      BlockState *State = BlockStates[BB];
      if (BB->pred_empty()) {
         State->BBMaxSet.reset();
      } else {
         // Intersect in all predecessors' BBSetOut.
         State->BBMaxSet.set();
         for (auto E = BB->pred_end(), I = BB->pred_begin(); I != E; ++I) {
            State->BBMaxSet &= BlockStates[*I]->BBMaxSet;
         }
      }

      // Process the instructions in the basic block to find what refcounted
      // roots are retained. If we know that an RC root can't be retained at a
      // basic block, then we know we do not need to consider it for the killset.
      // NOTE: this is a conservative approximation, because some retains may be
      // blocked before it reaches this block.
      for (auto &II : *BB) {
         if (!isRetainInstruction(&II))
            continue;
         State->BBMaxSet.set(RCRootIndex[getRCRoot(&II)]);
      }
   }
}

void RetainCodeMotionContext::computeCodeMotionGenKillSet() {
   for (PILBasicBlock *BB : PO->getReversePostOrder()) {
      auto *State = BlockStates[BB];
      bool InterestBlock = false;
      for (auto &I : *BB) {
         // Check whether this instruction blocks any RC root code motion.
         for (unsigned i = 0; i < RCRootVault.size(); ++i) {
            if (!State->BBMaxSet.test(i) || !mayBlockCodeMotion(&I, RCRootVault[i]))
               continue;
            // This is a blocking instruction for the rcroot.
            InterestBlock = true;
            State->BBKillSet.set(i);
            State->BBGenSet.reset(i);
         }
         // If this is a retain instruction, it also generates.
         if (isRetainInstruction(&I)) {
            unsigned idx = RCRootIndex[getRCRoot(&I)];
            State->BBGenSet.set(idx);
            assert(State->BBKillSet.test(idx) && "Killset computed incorrectly");
            State->BBKillSet.reset(idx);
            InterestBlock = true;
         }
      }

      // Is this a block that is interesting to the last iteration of the data
      // flow.
      if (!InterestBlock)
         continue;
      InterestBlocks.insert(BB);
   }
}

bool RetainCodeMotionContext::performCodeMotion() {
   bool Changed = false;
   // Create the new retain instructions.
   for (auto RC : RCRootVault) {
      auto Iter = InsertPoints.find(RC);
      if (Iter == InsertPoints.end())
         continue;

      for (auto IP : Iter->second) {
         // Check if the insertion point is in a block that we had previously
         // identified as a program termination point. In such a case, we know that
         // there are no releases or anything beyond a fatalError call. In such a
         // case, do not insert the retain. It is ok if we leak.
         if (PTFI.isProgramTerminatingBlock(IP->getParent()))
            continue;

         // We are about to insert a new retain instruction before the insertion
         // point. Check if the previous instruction is reusable, reuse it, do not
         // insert new instruction and delete old one.
         if (auto I = getPrevReusableInst(IP, Iter->first)) {
            RCInstructions.erase(I);
            continue;
         }
         createIncrementBefore(Iter->first, IP);
         Changed = true;
      }
   }
   // Remove the old retain instructions.
   for (auto R : RCInstructions) {
      ++NumRetainsSunk;
      recursivelyDeleteTriviallyDeadInstructions(R, true);
   }
   return Changed;
}

void RetainCodeMotionContext::mergeBBDataFlowStates(PILBasicBlock *BB) {
   BlockState *State = BlockStates[BB];
   State->BBSetIn.reset();
   // If basic block has no predecessor, simply reset and return.
   if (BB->pred_empty())
      return;

   // Intersect in all predecessors' BBSetOuts.
   auto Iter = BB->pred_begin();
   State->BBSetIn = BlockStates[*Iter]->BBSetOut;
   Iter = std::next(Iter);
   for (auto E = BB->pred_end(); Iter != E; ++Iter) {
      State->BBSetIn &= BlockStates[*Iter]->BBSetOut;
   }
}

bool RetainCodeMotionContext::processBBWithGenKillSet(PILBasicBlock *BB) {
   RetainBlockState *State = BlockStates[BB];
   // Compute the BBSetOut at the end of the basic block.
   mergeBBDataFlowStates(BB);

   // Compute the BBSetIn at the beginning of the basic block.
   State->BBSetIn.reset(State->BBKillSet);
   State->BBSetIn |= State->BBGenSet;

   // If BBSetIn changes, then keep iterating until reached a fixed point.
   return State->updateBBSetOut(State->BBSetIn);
}

void RetainCodeMotionContext::convergeCodeMotionDataFlow() {
   // Process each basic block with the genset and killset. Every time the
   // BBSetOut of a basic block changes, the optimization is rerun on its
   // successors.
   llvm::SmallVector<PILBasicBlock *, 16> WorkList;
   llvm::SmallPtrSet<PILBasicBlock *, 4> HandledBBs;
   // Push into reverse post order so that we can pop from the back and get
   // post order.
   for (PILBasicBlock *B : PO->getReversePostOrder()) {
      WorkList.push_back(B);
      HandledBBs.insert(B);
   }
   while (!WorkList.empty()) {
      PILBasicBlock *BB = WorkList.pop_back_val();
      HandledBBs.erase(BB);
      if (processBBWithGenKillSet(BB)) {
         for (auto &X : BB->getSuccessors()) {
            // We do not push basic block into the worklist if its already
            // in the worklist.
            if (HandledBBs.count(X))
               continue;
            WorkList.push_back(X);
         }
      }
   }
}

void RetainCodeMotionContext::computeCodeMotionInsertPoints() {
#ifndef NDEBUG
   printCtx.emplace(llvm::dbgs(), /*Verbose=*/false, /*Sorted=*/true);
#endif
   // The BBSetOuts have converged, run last iteration and figure out
   // insertion point for each refcounted root.
   for (PILBasicBlock *BB : PO->getReversePostOrder()) {
      mergeBBDataFlowStates(BB);
      RetainBlockState *S = BlockStates[BB];

      // Compute insertion point generated by the edge value transition.
      // If there is a transition from 1 to 0, that means we have a partial
      // merge, which means the retain can NOT be sunk to the current block,
      // so place it at the end of the predecessors.
      for (unsigned i = 0; i < RCRootVault.size(); ++i) {
         if (S->BBSetIn[i])
            continue;
         for (auto Pred : BB->getPredecessorBlocks()) {
            BlockState *PBB = BlockStates[Pred];
            if (!PBB->BBSetOut[i])
               continue;
            InsertPoints[RCRootVault[i]].push_back(Pred->getTerminator());
         }
      }

      // Is this block interesting. If we are sure this block does not generate
      // retains nor does it block any retains (i.e. no insertion point will be
      // created), we can skip it, as the BBSetOut has been converged if this is
      // a multi-iteration function.
      if (MultiIteration && !InterestBlocks.count(BB))
         continue;

      // Compute insertion point within the basic block. Process instructions in
      // the basic block in reverse post-order fashion.
      for (auto I = BB->begin(), E = BB->end(); I != E; ++I) {
         for (unsigned i = 0; i < RCRootVault.size(); ++i) {
            if (!S->BBSetIn[i] || !mayBlockCodeMotion(&*I, RCRootVault[i]))
               continue;
            S->BBSetIn.reset(i);
            InsertPoints[RCRootVault[i]].push_back(&*I);
         }

         // If this is a retain instruction, it also generates.
         if (isRetainInstruction(&*I)) {
            S->BBSetIn.set(RCRootIndex[getRCRoot(&*I)]);
         }
      }

      // Lastly update the BBSetOut, only necessary when we are running a single
      // iteration dataflow.
      if (!MultiIteration) {
         S->updateBBSetOut(S->BBSetIn);
      }
   }
}

//===----------------------------------------------------------------------===//
//                          Release Code Motion
//===----------------------------------------------------------------------===//

class ReleaseBlockState : public BlockState {
public:
   /// Check whether the BBSetIn has changed. If it does, we need to rerun
   /// the data flow on this block's predecessors to reach fixed point.
   bool updateBBSetIn(SmallBitVector &X) {
      if (BBSetIn == X)
         return false;
      BBSetIn = X;
      return true;
   }

   /// constructor.
   ///
   /// If \p InitOptimistic is true, the block in-bits are initialized to 1
   /// which enables optimistic data flow evaluation.
   ReleaseBlockState(bool InitOptimistic, unsigned size) {
      // backward data flow.
      // Initialize to true if we are running optimistic data flow, i.e.
      // MultiIteration is true.
      BBSetIn.resize(size, InitOptimistic);
      BBSetOut.resize(size, false);
      BBMaxSet.resize(size, InitOptimistic);

      // Genset and Killset are initially empty.
      BBGenSet.resize(size, false);
      BBKillSet.resize(size, false);
   }
};

/// ReleaseCodeMotionContext - Context to perform release code motion.
class ReleaseCodeMotionContext : public CodeMotionContext {
   /// All the release block state for all the basic blocks in the function.
   llvm::SmallDenseMap<PILBasicBlock *, ReleaseBlockState *> BlockStates;

   /// We are not moving epilogue releases.
   bool FreezeEpilogueReleases;

   /// The epilogue release matcher we are currently using.
   ConsumedArgToEpilogueReleaseMatcher &ERM;

   /// Return true if the instruction blocks the Ptr to be moved further.
   bool mayBlockCodeMotion(PILInstruction *II, PILValue Ptr) override {
      // NOTE: If more checks are to be added, place the most expensive in the end.
      // This function is called many times.
      //
      // We can not move a release above the instruction that defines the
      // released value.
      if (II == Ptr->getDefiningInstruction())
         return true;
      // Identical RC root blocks code motion, we will be able to move this release
      // further once we move the blocking release.
      if (isReleaseInstruction(II) && getRCRoot(II) == Ptr) {
         LLVM_DEBUG(if (printCtx) llvm::dbgs()
               << "Release " << Ptr << "  at matching release " << *II);
         return true;
      }
      // Stop at may interfere.
      if (mayHaveSymmetricInterference(II, Ptr, AA)) {
         LLVM_DEBUG(if (printCtx) llvm::dbgs()
               << "Release " << Ptr << "  at interference " << *II);
         return true;
      }
      // This instruction does not block the release.
      return false;
   }

   /// Return the successor instruction if it happens to be a release with the
   /// given RC root, nullptr otherwise.
   PILInstruction *getPrevReusableInst(PILInstruction *I, PILValue Root) {
      if (&*I->getParent()->begin() == I)
         return nullptr;
      auto Prev = &*std::prev(PILBasicBlock::iterator(I));
      if (isReleaseInstruction(Prev) && getRCRoot(Prev) == Root)
         return Prev;
      return nullptr;
   }

public:
   /// Constructor.
   ReleaseCodeMotionContext(llvm::SpecificBumpPtrAllocator<BlockState> &BPA,
                            PILFunction *F, PostOrderFunctionInfo *PO,
                            AliasAnalysis *AA, RCIdentityFunctionInfo *RCFI,
                            bool FreezeEpilogueReleases,
                            ConsumedArgToEpilogueReleaseMatcher &ERM)
      : CodeMotionContext(BPA, F, PO, AA, RCFI),
        FreezeEpilogueReleases(FreezeEpilogueReleases), ERM(ERM) {}

   /// virtual destructor.
   ~ReleaseCodeMotionContext() override {}

   /// Return true if the data flow can converge in 1 iteration.
   bool requireIteration() override;

   /// Initialize necessary things to run the iterative data flow.
   void initializeCodeMotionDataFlow() override;

   /// Initialize the basic block maximum refcounted set.
   void initializeCodeMotionBBMaxSet() override;

   /// Compute the genset and killset for every root in every basic block.
   void computeCodeMotionGenKillSet() override;

   /// Run the iterative data flow to converge.
   void convergeCodeMotionDataFlow() override;

   /// Use the data flow results, come up with places to insert the new inst.
   void computeCodeMotionInsertPoints() override;

   /// Remove the old retains and create the new *moved* refcounted instructions
   bool performCodeMotion() override;

   /// Compute the BBSetIn and BBSetOut for the current basic
   /// block with the generated gen and kill set.
   bool processBBWithGenKillSet(PILBasicBlock *BB) override;

   /// Merge the data flow states.
   void mergeBBDataFlowStates(PILBasicBlock *BB) override;
};

bool ReleaseCodeMotionContext::requireIteration() {
   // If all basic blocks will have their successors processed if the basic
   // blocks in the functions are iterated in post order. Then this function
   // can be processed in one iteration, i.e. no need to generate the genset
   // and killset.
   llvm::SmallPtrSet<PILBasicBlock *, 4> PBBs;
   for (PILBasicBlock *B : PO->getPostOrder()) {
      for (auto &X : B->getSuccessors()) {
         if (!PBBs.count(X))
            return true;
      }
      PBBs.insert(B);
   }
   return false;
}

void ReleaseCodeMotionContext::initializeCodeMotionDataFlow() {
   // All blocks which are initialized with 1-bits. These are all blocks which
   // eventually reach the function exit (return, throw), excluding the
   // function exit blocks themselves.
   // Optimistic initialization enables moving releases across loops. On the
   // other hand, blocks, which never reach the function exit, e.g. infinite
   // loop blocks, must be excluded. Otherwise we would end up inserting
   // completely unrelated release instructions in such blocks.
   llvm::SmallPtrSet<PILBasicBlock *, 32> BlocksInitOptimistically;

   llvm::SmallVector<PILBasicBlock *, 32> Worklist;

   // Find all the RC roots in the function.
   for (auto &BB : *F) {
      for (auto &II : BB) {
         if (!isReleaseInstruction(&II))
            continue;
         // Do not try to enumerate if we are not hoisting epilogue releases.
         if (FreezeEpilogueReleases && ERM.isEpilogueRelease(&II))
            continue;
         PILValue Root = getRCRoot(&II);
         RCInstructions.insert(&II);
         if (RCRootIndex.find(Root) != RCRootIndex.end())
            continue;
         RCRootIndex[Root] = RCRootVault.size();
         RCRootVault.insert(Root);
         LLVM_DEBUG(llvm::dbgs()
                       << "Release Root #" << RCRootVault.size() << " " << Root);
      }
      if (MultiIteration && BB.getTerminator()->isFunctionExiting())
         Worklist.push_back(&BB);
   }

   // Find all blocks from which there is a path to the function exit.
   // Note: the Worklist is empty if we are not in MultiIteration mode.
   while (!Worklist.empty()) {
      PILBasicBlock *BB = Worklist.pop_back_val();
      for (PILBasicBlock *Pred : BB->getPredecessorBlocks()) {
         if (BlocksInitOptimistically.insert(Pred).second)
            Worklist.push_back(Pred);
      }
   }

   // Initialize all the data flow bit vector for all basic blocks.
   for (auto &BB : *F) {
      BlockStates[&BB] = new (BPA.Allocate())
         ReleaseBlockState(BlocksInitOptimistically.count(&BB) != 0,
                           RCRootVault.size());
   }
}

void ReleaseCodeMotionContext::initializeCodeMotionBBMaxSet() {
   for (PILBasicBlock *BB : PO->getPostOrder()) {
      // If basic block has no successor, do nothing.
      BlockState *State = BlockStates[BB];
      if (BB->succ_empty()) {
         State->BBMaxSet.reset();
      } else {
         // Intersect in all successors' BBMaxOuts.
         State->BBMaxSet.set();
         for (auto E = BB->succ_end(), I = BB->succ_begin(); I != E; ++I) {
            State->BBMaxSet &= BlockStates[*I]->BBMaxSet;
         }
      }

      // Process the instructions in the basic block to find what refcounted
      // roots are released. If we know that an RC root can't be released at a
      // basic block, then we know we do not need to consider it for the killset.
      // NOTE: this is a conservative approximation, because some releases may be
      // blocked before it reaches this block.
      for (auto II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
         if (!isReleaseInstruction(&*II))
            continue;
         State->BBMaxSet.set(RCRootIndex[getRCRoot(&*II)]);
      }
   }
}

void ReleaseCodeMotionContext::computeCodeMotionGenKillSet() {
   for (PILBasicBlock *BB : PO->getPostOrder()) {
      auto *State = BlockStates[BB];
      bool InterestBlock = false;
      for (auto I = BB->rbegin(), E = BB->rend(); I != E; ++I) {
         // Check whether this instruction blocks any RC root code motion.
         for (unsigned i = 0; i < RCRootVault.size(); ++i) {
            if (!State->BBMaxSet.test(i) || !mayBlockCodeMotion(&*I, RCRootVault[i]))
               continue;
            // This instruction blocks this RC root.
            InterestBlock = true;
            State->BBKillSet.set(i);
            State->BBGenSet.reset(i);
         }

         // If this is an epilogue release and we are freezing epilogue release
         // simply continue.
         if (FreezeEpilogueReleases && ERM.isEpilogueRelease(&*I))
            continue;

         // If this is a release instruction, it also generates.
         if (isReleaseInstruction(&*I)) {
            unsigned idx = RCRootIndex[getRCRoot(&*I)];
            State->BBGenSet.set(idx);
            assert(State->BBKillSet.test(idx) && "Killset computed incorrectly");
            State->BBKillSet.reset(idx);
            InterestBlock = true;
         }
      }

      // Handle PILArgument, PILArgument can invalidate.
      for (unsigned i = 0; i < RCRootVault.size(); ++i) {
         auto *A = dyn_cast<PILArgument>(RCRootVault[i]);
         if (!A || A->getParent() != BB)
            continue;
         InterestBlock = true;
         State->BBKillSet.set(i);
         State->BBGenSet.reset(i);
      }

      // Is this interesting to the last iteration of the data flow.
      if (!InterestBlock)
         continue;
      InterestBlocks.insert(BB);
   }
}

void ReleaseCodeMotionContext::mergeBBDataFlowStates(PILBasicBlock *BB) {
   BlockState *State = BlockStates[BB];
   State->BBSetOut.reset();
   // If basic block has no successor, simply reset and return.
   if (BB->succ_empty())
      return;

   // Intersect in all successors' BBSetIn.
   auto Iter = BB->succ_begin();
   State->BBSetOut = BlockStates[*Iter]->BBSetIn;
   Iter = std::next(Iter);
   for (auto E = BB->succ_end(); Iter != E; ++Iter) {
      State->BBSetOut &= BlockStates[*Iter]->BBSetIn;
   }
}

bool ReleaseCodeMotionContext::performCodeMotion() {
   bool Changed = false;
   SmallVector<PILInstruction *, 8> NewReleases;
   // Create the new releases at each anchor point.
   for (auto RC : RCRootVault) {
      auto Iter = InsertPoints.find(RC);
      if (Iter == InsertPoints.end())
         continue;
      for (auto IP : Iter->second) {
         // we are about to insert a new release instruction before the insertion
         // point. Check if the successor instruction is reusable, reuse it, do
         // not insert new instruction and delete old one.
         if (auto I = getPrevReusableInst(IP, Iter->first)) {
            if (RCInstructions.erase(I))
               NewReleases.push_back(I);
            continue;
         }
         if (PILInstruction *I = createDecrementBefore(Iter->first, IP).getPtrOrNull())
            NewReleases.push_back(I);
         Changed = true;
      }
   }
   // Remove the old release instructions.
   for (auto R : RCInstructions) {
      ++NumReleasesHoisted;
      recursivelyDeleteTriviallyDeadInstructions(R, true);
   }

   // Eliminate pairs of retain-release if they are adjacent to each other and
   // retain/release the same RCRoot, e.g.
   //    strong_retain %2
   //    strong_release %2
   for (PILInstruction *ReleaseInst : NewReleases) {
      auto InstIter = ReleaseInst->getIterator();
      if (InstIter == ReleaseInst->getParent()->begin())
         continue;

      PILInstruction *PrevInst = &*std::prev(InstIter);
      if (isRetainInstruction(PrevInst) && getRCRoot(PrevInst) == getRCRoot(ReleaseInst)) {
         recursivelyDeleteTriviallyDeadInstructions(PrevInst, true);
         recursivelyDeleteTriviallyDeadInstructions(ReleaseInst, true);
      }
   }

   return Changed;
}

bool ReleaseCodeMotionContext::processBBWithGenKillSet(PILBasicBlock *BB) {
   ReleaseBlockState *State = BlockStates[BB];
   // Compute the BBSetOut at the end of the basic block.
   mergeBBDataFlowStates(BB);

   // Compute the BBSetIn at the beginning of the basic block.
   State->BBSetOut.reset(State->BBKillSet);
   State->BBSetOut |= State->BBGenSet;

   // If BBSetIn changes, then keep iterating until reached a fixed point.
   return State->updateBBSetIn(State->BBSetOut);
}

void ReleaseCodeMotionContext::convergeCodeMotionDataFlow() {
   // Process each basic block with the gen and kill set. Every time the
   // BBSetIn of a basic block changes, the optimization is rerun on its
   // predecessors.
   llvm::SmallVector<PILBasicBlock *, 16> WorkList;
   llvm::SmallPtrSet<PILBasicBlock *, 8> HandledBBs;
   // Push into reverse post order so that we can pop from the back and get
   // post order.
   for (PILBasicBlock *B : PO->getPostOrder()) {
      WorkList.push_back(B);
      HandledBBs.insert(B);
   }
   while (!WorkList.empty()) {
      PILBasicBlock *BB = WorkList.pop_back_val();
      HandledBBs.erase(BB);
      if (processBBWithGenKillSet(BB)) {
         for (auto X : BB->getPredecessorBlocks()) {
            // We do not push basic block into the worklist if its already
            // in the worklist.
            if (HandledBBs.count(X))
               continue;
            WorkList.push_back(X);
         }
      }
   }
}

void ReleaseCodeMotionContext::computeCodeMotionInsertPoints() {
#ifndef NDEBUG
   printCtx.emplace(llvm::dbgs(), /*Verbose=*/false, /*Sorted=*/true);
#endif

   // The BBSetIns have converged, run last iteration and figure out insertion
   // point for each RC root.
   for (PILBasicBlock *BB : PO->getPostOrder()) {
      // Intersect in the successor BBSetIns.
      mergeBBDataFlowStates(BB);
      ReleaseBlockState *S = BlockStates[BB];

      // Compute insertion point generated by the edge value transition.
      // If there is a transition from 1 to 0, that means we have a partial
      // merge, which means the release can NOT be hoisted to the current block.
      // place it at the successors.
      for (unsigned i = 0; i < RCRootVault.size(); ++i) {
         if (S->BBSetOut[i])
            continue;
         for (auto &Succ : BB->getSuccessors()) {
            BlockState *SBB = BlockStates[Succ];
            if (!SBB->BBSetIn[i])
               continue;
            InsertPoints[RCRootVault[i]].push_back(&*(*Succ).begin());
            LLVM_DEBUG(llvm::dbgs()
                          << "Release partial merge. Insert at successor: "
                          << printCtx->getID(BB) << " " << RCRootVault[i]);
         }
      }

      // Is this block interesting ?
      if (MultiIteration && !InterestBlocks.count(BB))
         continue;

      // Compute insertion point generated by MayUse terminator inst.
      // If terminator instruction can block the RC root. We will have no
      // choice but to anchor the release instructions in the successor blocks.
      for (unsigned i = 0; i < RCRootVault.size(); ++i) {
         PILInstruction *Term = BB->getTerminator();
         if (!S->BBSetOut[i] || !mayBlockCodeMotion(Term, RCRootVault[i]))
            continue;
         for (auto &Succ : BB->getSuccessors()) {
            BlockState *SBB = BlockStates[Succ];
            if (!SBB->BBSetIn[i])
               continue;
            InsertPoints[RCRootVault[i]].push_back(&*(*Succ).begin());
            LLVM_DEBUG(llvm::dbgs()
                          << "Release terminator use. Insert at successor: "
                          << printCtx->getID(BB) << " " << RCRootVault[i]);
         }
         S->BBSetOut.reset(i);
      }

      // Compute insertion point generated within the basic block. Process
      // instructions in post-order fashion.
      for (auto I = std::next(BB->rbegin()), E = BB->rend(); I != E; ++I) {
         for (unsigned i = 0; i < RCRootVault.size(); ++i) {
            if (!S->BBSetOut[i] || !mayBlockCodeMotion(&*I, RCRootVault[i]))
               continue;
            auto *InsertPt = &*std::next(PILBasicBlock::iterator(&*I));
            InsertPoints[RCRootVault[i]].push_back(InsertPt);
            S->BBSetOut.reset(i);
         }

         // If we are freezing this epilogue release. Simply continue.
         if (FreezeEpilogueReleases && ERM.isEpilogueRelease(&*I))
            continue;

         // This release generates.
         if (isReleaseInstruction(&*I)) {
            S->BBSetOut.set(RCRootIndex[getRCRoot(&*I)]);
         }
      }

      // Compute insertion point generated by PILArgument. PILArgument blocks if
      // it defines the released value.
      for (unsigned i = 0; i < RCRootVault.size(); ++i) {
         if (!S->BBSetOut[i])
            continue;
         auto *A = dyn_cast<PILArgument>(RCRootVault[i]);
         if (!A || A->getParent() != BB)
            continue;
         InsertPoints[RCRootVault[i]].push_back(&*BB->begin());
         S->BBSetOut.reset(i);
      }

      // Lastly update the BBSetIn, only necessary when we are running a single
      // iteration dataflow.
      if (!MultiIteration) {
         S->updateBBSetIn(S->BBSetOut);
      }
   }
}

//===----------------------------------------------------------------------===//
//            Eliminate Retains Before Program Termination Points
//===----------------------------------------------------------------------===//

static void eliminateRetainsPrecedingProgramTerminationPoints(PILFunction *f) {
   for (auto &block : *f) {
      auto *term = block.getTerminator();
      // If we don't have an unreachable or an unreachable that is the only
      // element in the block, bail.
      if (!isa<UnreachableInst>(term) || term == &*block.begin())
         continue;

      auto iter = std::prev(term->getIterator());

      // If we have an apply next, see if it is a program termination point. In
      // such a case, we can ignore it. All other functions though imply we must
      // bail. If we don't have a function here, check for side
      if (auto apply = FullApplySite::isa(&*iter)) {
         if (!apply.isCalleeKnownProgramTerminationPoint()) {
            continue;
         }
      } else {
         // If we didn't have an apply, move back onto the unreachable so that we
         // can begin the loop in a proper state where we the current position of
         // the iterator has already been tested.
         ++iter;
      }

      while (iter != block.begin()) {
         // Move iter back to the prev instruction and see if iter is a retain
         // instruction. If it is not, then break out of the loop. We found a
         // non-retain instruction so can not optimize further since we do not want
         // to shorten the lifetime of any values that may be used before the
         // program termination.
         --iter;

         // First check if iter has side-effects. If iter doesn't have
         // side-effects, then ignore it.
         //
         // TODO: Use SideEffectsAnalysis here.
         if (!iter->mayHaveSideEffects())
            continue;

         if (!isa<StrongRetainInst>(&*iter) && !isa<RetainValueInst>(&*iter)) {
            break;
         }

         // Since we are going to delete this instruction, we grab the pointer to
         // the instruction, move iter to the prev instruction and erase the
         // instruction.
         auto *i = &*iter;
         auto tmp = prev_or_default(iter, block.begin(), block.end());
         i->eraseFromParent();

         // If tmp is the end of the block, then we wrapped... break out of the
         // loop we did all of the work that we could.
         if (tmp == block.end())
            break;

         // Otherwise, set iter to point at the next instruction.
         iter = std::next(tmp);
      }
   }
}

//===----------------------------------------------------------------------===//
//                           Top Level Entry Point
//===----------------------------------------------------------------------===//

namespace {

/// Code motion kind.
enum CodeMotionKind : unsigned { Retain = 0, Release = 1};

class ARCCodeMotion : public PILFunctionTransform {
   /// Whether to hoist releases or sink retains.
   CodeMotionKind Kind;

   /// Freeze epilogue release or not.
   bool FreezeEpilogueReleases;

public:
   /// Constructor.
   ARCCodeMotion(CodeMotionKind H, bool F) : Kind(H), FreezeEpilogueReleases(F) {}

   /// The entry point to the transformation.
   void run() override {
      // Code motion disabled.
      if (DisableARCCodeMotion)
         return;

      // Respect function no.optimize.
      PILFunction *F = getFunction();
      if (!F->shouldOptimize())
         return;

      // FIXME: Support ownership.
      if (F->hasOwnership())
         return;

      LLVM_DEBUG(llvm::dbgs() << "*** ARCCM on function: " << F->getName()
                              << " ***\n");

      PostOrderAnalysis *POA = PM->getAnalysis<PostOrderAnalysis>();

      // Split all critical edges.
      //
      // TODO: maybe we can do this lazily or maybe we should disallow PIL passes
      // to create critical edges.
      bool EdgeChanged = splitAllCriticalEdges(*F, nullptr, nullptr);
      if (EdgeChanged)
         POA->invalidateFunction(F);

      auto *PO = POA->get(F);
      auto *AA = PM->getAnalysis<AliasAnalysis>();
      auto *RCFI = PM->getAnalysis<RCIdentityAnalysis>()->get(F);

      llvm::SpecificBumpPtrAllocator<BlockState> BPA;
      bool InstChanged = false;
      if (Kind == Release) {
         // TODO: we should consider Throw block as well, or better we should
         // abstract the Return block or Throw block away in the matcher.
         PILArgumentConvention Conv[] = {PILArgumentConvention::Direct_Owned};
         ConsumedArgToEpilogueReleaseMatcher ERM(RCFI, F,
                                                 Conv,
                                                 ConsumedArgToEpilogueReleaseMatcher::ExitKind::Return);

         ReleaseCodeMotionContext RelCM(BPA, F, PO, AA, RCFI,
                                        FreezeEpilogueReleases, ERM);
         // Run release hoisting.
         InstChanged |= RelCM.run();
      } else {
         RetainCodeMotionContext RetCM(BPA, F, PO, AA, RCFI);
         // Run retain sinking.
         InstChanged |= RetCM.run();
         // Eliminate any retains that are right before program termination
         // points. We assume that any retains before semantic calls marked as
         // program termination points can be eliminated since by assumption we are
         // going to be leaking these objects and any releases that were afterwards
         // were already eliminated. Assuming that the IR is correctly balanced
         // from an ARC perspective.
         eliminateRetainsPrecedingProgramTerminationPoints(F);
      }

      if (EdgeChanged) {
         // We splitted critical edges.
         invalidateAnalysis(PILAnalysis::InvalidationKind::FunctionBody);
         return;
      }
      if (InstChanged) {
         // We moved instructions.
         invalidateAnalysis(PILAnalysis::InvalidationKind::Instructions);
      }
   }
};

} // end anonymous namespace

/// Sink Retains.
PILTransform *polar::createRetainSinking() {
   return new ARCCodeMotion(CodeMotionKind::Retain, false);
}

/// Hoist releases, but not epilogue release. ASO relies on epilogue releases
/// to prove knownsafety on enclosed releases.
PILTransform *polar::createReleaseHoisting() {
   return new ARCCodeMotion(CodeMotionKind::Release, true);
}

/// Hoist all releases.
PILTransform *polar::createLateReleaseHoisting() {
   return new ARCCodeMotion(CodeMotionKind::Release, false);
}
